Collaboration study with Deepa

Generating T-cell signature gene list

Last time I generated the T-cell signature file, I used all HLA genes and sex chromosome genes. The result showed some childrend do not have one paticular cluter. Then, I checked this cluster carefully and found that some ethnic group does not have certain HLA. The T-cell dataset I used was pooled sample analysis. This suggests that we have to eliminate the HLA, mitochondoria, ribozomal and genes on the sex chromosomes.

CD4+/CD45RO+ Memory T Cells https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/memory_t

CD4+/CD45RA+/CD25- Naive T cells https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/naive_t

CD4+/CD25+ Regulatory T Cells https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/regulatory_t

CD4+ Helper T Cells https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/cd4_t_helper

In [1]:
library(Seurat)
library(dplyr)
library(Matrix)
library(cowplot)
setwd("/Volumes/home/msuzuki/scRNA")
Loading required package: ggplot2
Loading required package: cowplot

Attaching package: ‘cowplot’

The following object is masked from ‘package:ggplot2’:

    ggsave

Loading required package: Matrix

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

1. Loading data

In [3]:
memory_t.data <- Read10X("memory_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = memory_t.data), value = TRUE)
gene_name <-rownames(memory_t.data)
'%ni%' <- Negate('%in%')
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(memory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(memory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(memory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(memory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(memory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(memory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(memory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
sex_chr <-read.table("~/scRNA_seq/sex_chr_human.txt")
sex_chr2 <-sex_chr[,2]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
memory_t.data2<-memory_t.data[rownames(memory_t.data)%in%not_ribo_auto,]

memory_t <- CreateSeuratObject(raw.data = memory_t.data2, min.cells = 3, min.genes = 200, project = "memory_t")

naive_t.data <- Read10X("naive_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = naive_t.data), value = TRUE)
gene_name <-rownames(naive_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(naive_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(naive_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(naive_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(naive_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(naive_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(naive_t.data), value = T)
hla.genes <-grep("^HLA", rownames(naive_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))

not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
naive_t.data2<-naive_t.data[rownames(naive_t.data)%in%not_ribo_auto,]


naive_t <- CreateSeuratObject(raw.data = naive_t.data2, min.cells = 3, min.genes = 200, project = "naive_t")

regulatory_t.data <- Read10X("regulatory_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = regulatory_t.data), value = TRUE)
gene_name <-rownames(regulatory_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(regulatory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(regulatory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(regulatory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(regulatory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(regulatory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(regulatory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(regulatory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))

not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
regulatory_t.data2<-regulatory_t.data[rownames(regulatory_t.data)%in%not_ribo_auto,]



regulatory_t <- CreateSeuratObject(raw.data = regulatory_t.data2, min.cells = 3, min.genes = 200, project = "regulatory_t")

memory_t@meta.data[, "protocol"] <- "memory_t"
naive_t@meta.data[, "protocol"] <- "naive_t"
regulatory_t@meta.data[, "protocol"] <- "regulatory_t"

2. Identifying genes to be excluded and remove from the dataset

In [4]:
mito.genes <- grep("^MT-", x = rownames(x = T_nmr@data), value = TRUE)
percent.mito <- Matrix::colSums(T_nmr@raw.data[mito.genes, ])/Matrix::colSums(T_nmr@raw.data)
T_nmr <- AddMetaData(T_nmr, percent.mito, "percent.mito")


'%ni%' <- Negate('%in%')
gene_name <-rownames(T_nmr@data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(T_nmr@data), value = T)
ribo2.genes <- grep("^RPL", rownames(T_nmr@data), value = T)
ribo.genes <-c(ribo1.genes,ribo2.genes)
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
sex_chr <-read.table("~/scRNA_seq/sex_chr_human.txt")
sex_chr2 <-sex_chr[,2]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
Error in rownames(x = T_nmr@data): object 'T_nmr' not found
Traceback:

1. grep("^MT-", x = rownames(x = T_nmr@data), value = TRUE)
2. grep("^MT-", x = rownames(x = T_nmr@data), value = TRUE)
3. rownames(x = T_nmr@data)

3. Finding variable genes

In [68]:
T_nm <-MergeSeurat(naive_t,memory_t,add.cell.id1 = "T_n",add.cell.id2 ="T_m")
T_nmr <-MergeSeurat(T_nm,regulatory_t,add.cell.id2 ="T_r")
T_nmr <- FilterCells(object = T_nmr, subset.names = c("nGene"), 
                      low.thresholds = c(200), high.thresholds = c(3000))
T_nmr <- NormalizeData(object = T_nmr)
T_nmr <- ScaleData(object = T_nmr,vars.to.regress=c("nGene","nUMI"))
T_nmr <- FindVariableGenes(object = T_nmr, mean.function = ExpMean, dispersion.function = LogVMR, 
                            x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
saveRDS(T_nmr,file = "T_nmr2.RDS")
[1] "Regressing out nGene" "Regressing out nUMI" 
  |======================================================================| 100%
[1] "Scaling data matrix"
  |======================================================================| 100%
Warning message in gzfile(file, mode):
“cannot open compressed file 'T_nmr2.RDS', probable reason 'No such file or directory'”
Error in gzfile(file, mode): cannot open the connection
Traceback:

1. saveRDS(T_nmr, file = "T_nmr2.RDS")
2. gzfile(file, mode)
In [69]:
length(rownames(T_nmr@data))
12959
In [70]:
cat("Identified variable genes across the single cells = ", length(x = T_nmr@var.genes))
T_nmr <- RunPCA(object = T_nmr, pc.genes = T_nmr@var.genes, do.print = TRUE, pcs.print = 1:5, 
                 genes.print = 5,pcs.compute = 20)
Identified variable genes across the single cells =  1434[1] "PC1"
[1] "LGALS1"  "S100A10" "DUSP1"   "ANXA2"   "KLF6"   
[1] ""
[1] "CCR7"   "SELL"   "CD7"    "LEF1"   "TMIGD2"
[1] ""
[1] ""
[1] "PC2"
[1] "FANK1"     "LINC00152" "TIGIT"     "IL2RA"     "CPNE2"    
[1] ""
[1] "CCL5" "FOS"  "GZMK" "IL7R" "GZMA"
[1] ""
[1] ""
[1] "PC3"
[1] "VIM"    "FOS"    "CAPG"   "LGALS1" "DUSP1" 
[1] ""
[1] "GZMK" "CCL5" "GZMA" "NKG7" "CST7"
[1] ""
[1] ""
[1] "PC4"
[1] "JUN"   "ZFP36" "DUSP1" "BTG2"  "FOS"  
[1] ""
[1] "LGALS1"  "IFITM2"  "S100A6"  "S100A11" "NOSIP"  
[1] ""
[1] ""
[1] "PC5"
[1] "KLRB1"  "GIMAP7" "TRADD"  "IL7R"   "AQP3"  
[1] ""
[1] "CCR10"    "CXCR4"    "CD7"      "C1orf162" "CPNE2"   
[1] ""
[1] ""
In [71]:
options(repr.plot.width = 8, repr.plot.height = 4)
PCElbowPlot(T_nmr, num.pc = 20)
PCAPlot(object = T_nmr, dim.1 = 1, dim.2 = 2)
In [72]:
options(repr.plot.width = 8, repr.plot.height = 8)
PCHeatmap(object = T_nmr, pc.use = 1:12, cells.use = 500, do.balanced = TRUE, 
          label.columns = FALSE, use.full = FALSE)
In [73]:
saveRDS(T_nmr,file = "~/scRNA_seq/T_nmr2.RDS")

Based on the PCA results, I decided to use PC1 to PC10 for TSENE.

4. Finding clusters

In [75]:
T_nmr <- RunTSNE(T_nmr, dims.use = 1:10, do.fast = T)
T_nmr <- FindClusters(object = T_nmr,reduction.type = "pca", dims.use = 1:10, resolution = 0.6, save.SNN = T,force.recalc=T)

PrintFindClustersParams(object = T_nmr)
[1] "Constructing SNN"
  |======================================================================| 100%
Parameters used in latest FindClusters calculation run on: 2018-01-05 19:59:17
=============================================================================
Resolution: 0.6
-----------------------------------------------------------------------------
Modularity Function    Algorithm         n.start         n.iter
     1                   1                 100             10
-----------------------------------------------------------------------------
Reduction used          k.param          k.scale          prune.SNN
     pca                 30                25              0.0667
-----------------------------------------------------------------------------
Dims used in calculation
=============================================================================
1 2 3 4 5 6 7 8 9 10

In [76]:
options(repr.plot.width = 6, repr.plot.height = 6)

p1 <- TSNEPlot(object = T_nmr,do.label = T,group.by = "protocol",pt.size = 0.2)
p2 <- TSNEPlot(object = T_nmr,do.label = T,pt.size = 0.2)

May 10th, 2018

To save the images with higher resolution

In [5]:
library(Seurat)
library(dplyr)
library(Matrix)
library(cowplot)

load("~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/T_nmr2.Robj")

PrintFindClustersParams(object = T_nmr)
Parameters used in latest FindClusters calculation run on: 2018-01-05 14:59:17
=============================================================================
Resolution: 0.6
-----------------------------------------------------------------------------
Modularity Function    Algorithm         n.start         n.iter
     1                   1                 100             10
-----------------------------------------------------------------------------
Reduction used          k.param          prune.SNN
     pca                 30                0.0667
-----------------------------------------------------------------------------
Dims used in calculation
=============================================================================
1 2 3 4 5 6 7 8 9 10

In [6]:
#pdf(file="~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/Deepa/TSNE_plots.pdf",width = 6,height = 6)
p1 <- TSNEPlot(object = T_nmr,do.label = T,group.by = "protocol",pt.size = 0.2)
p2 <- TSNEPlot(object = T_nmr,do.label = T,pt.size = 0.2)
#dev.off()
In [77]:
options(repr.plot.width = 8, repr.plot.height = 4)
VlnPlot(object = T_nmr, features.plot = c("CCR7", "CD7"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("TMIGD2", "SELL"),point.size.use = 0.01) 
VlnPlot(object = T_nmr, features.plot = c("LEF1", "LGALS1"),point.size.use = 0.01) 
VlnPlot(object = T_nmr, features.plot = c("S100A10", "ANXA2"),point.size.use = 0.01) 
VlnPlot(object = T_nmr, features.plot = c("DUSP1","CLIC1"),point.size.use = 0.01) 
VlnPlot(object = T_nmr, features.plot = c("CCL5", "HOPX"),point.size.use = 0.01) 
VlnPlot(object = T_nmr, features.plot = c("GZMA", "GZMK"),point.size.use = 0.01) 
VlnPlot(object = T_nmr, features.plot = c("FOS","IL7R"),point.size.use = 0.01) 
VlnPlot(object = T_nmr, features.plot = c("TIGIT","IL2RA"),point.size.use = 0.01)
VlnPlot(object = T_nmr, features.plot = c("NKG7","CCR10"),point.size.use = 0.01)
In [78]:
Tcell.markers <- FindAllMarkers(T_nmr, only.pos = TRUE, min.pct = 0.3,test.use="bimod",logfc.threshold = 0.32)
Tcell.markers2 <-subset(Tcell.markers,Tcell.markers$p_val_adj<0.0001)
write.table(Tcell.markers2,file="~/scRNA_seq/Tcell.markers_auto.csv",sep=",")
head(Tcell.markers)
cat("Number of the marker gene = ",length(unique(Tcell.markers2$gene)))
p_valavg_logFCpct.1pct.2p_val_adjclustergene
CCR70 0.98473260.381 0.121 0 0 CCR7
LEF10 0.75827520.376 0.162 0 0 LEF1
SELL0 0.73953560.511 0.245 0 0 SELL
CD70 0.69995030.646 0.374 0 0 CD7
NOSIP0 0.46965520.738 0.538 0 0 NOSIP
TMEM660 0.43675750.831 0.678 0 0 TMEM66
Number of the marker gene =  348
In [85]:
save(T_nmr,file = "~/scRNA_seq/T_nmr2.Robj")

Generating signature gene profile

I calculated the median of signature genes in each cell cluster.

In [79]:
normalized<- data.frame(T_nmr@scale.data)
celltypes <-data.frame(T_nmr@ident)
geneLIST<-c(Tcell.markers2$gene)

sig_gene<-normalized[which(row.names(normalized)%in%geneLIST),]
Cluster_0<-sig_gene[,which(celltypes==0)]
Cluster_1<-sig_gene[,which(celltypes==1)]
Cluster_2<-sig_gene[,which(celltypes==2)]
Cluster_3<-sig_gene[,which(celltypes==3)]
Cluster_4<-sig_gene[,which(celltypes==4)]
Cluster_5<-sig_gene[,which(celltypes==5)]
Cluster_6<-sig_gene[,which(celltypes==6)]
#Cluster_7<-sig_gene[,which(celltypes==7)]
#Cluster_8<-sig_gene[,which(celltypes==8)]
#Cluster_9<-sig_gene[,which(celltypes==9)]
#Cluster_10<-sig_gene[,which(celltypes==10)]
#Cluster_11<-sig_gene[,which(celltypes==11)]
#Cluster_12<-sig_gene[,which(celltypes==12)]
#Cluster_13<-sig_gene[,which(celltypes==13)]
#Cluster_14<-sig_gene[,which(celltypes==14)]

j=nrow(sig_gene)
signature_data <-matrix(nrow=j,ncol=7)
for(i in 1:j)
  signature_data[i,1]<-2^median(as.double(Cluster_0[i,]))
for(i in 1:j)
  signature_data[i,2]<-2^median(as.double(Cluster_1[i,]))
for(i in 1:j)
  signature_data[i,3]<-2^median(as.double(Cluster_2[i,]))
for(i in 1:j)
  signature_data[i,4]<-2^median(as.double(Cluster_3[i,]))
for(i in 1:j)
  signature_data[i,5]<-2^median(as.double(Cluster_4[i,]))
for(i in 1:j)
  signature_data[i,6]<-2^median(as.double(Cluster_5[i,]))
for(i in 1:j)
  signature_data[i,7]<-2^median(as.double(Cluster_6[i,]))
#for(i in 1:j)
#  signature_data[i,8]<-2^median(as.double(Cluster_7[i,]))
#for(i in 1:j)
#  signature_data[i,9]<-2^median(as.double(Cluster_8[i,]))
#for(i in 1:j)
#  signature_data[i,10]<-2^median(as.double(Cluster_9[i,]))
#for(i in 1:j)
#  signature_data[i,11]<-2^median(as.double(Cluster_10[i,]))
#for(i in 1:j)
#  signature_data[i,12]<-2^median(as.double(Cluster_11[i,]))
#for(i in 1:j)
#  signature_data[i,13]<-2^median(as.double(Cluster_12[i,]))
#for(i in 1:j)
#  signature_data[i,14]<-10^median(as.double(Cluster_13[i,]))
#for(i in 1:j)
#  signature_data[i,15]<-10^median(as.double(Cluster_14[i,]))

colnames(signature_data)<-c("cluster_0","cluster_1","cluster_2","cluster_3","cluster_4","cluster_5","cluster_6")
                            #,"cluster_7","cluster_8","cluster_9")
                            #,"cluster_10","cluster_11","cluster_12")
                            #,"cluster_13","cluster_14")
row.names(signature_data)<-row.names(sig_gene)

write.table(signature_data,"~/scRNA_seq/singarture_scRNA_Tcell_auto.txt",sep="\t",row.names = T,col.names = T,quote=F)
In [13]:
library(gplots)
library(RColorBrewer)
library(dendextend)
library(ggplot2)
library(biomaRt)
library(reshape2)
library(ggdendro)
library(scales)
In [80]:
Est_CellType <-read.csv(file="~/Documents/Tcell_scRNAseq/CIBERSORT.Output_Job41.csv",header = T)
row.names(Est_CellType) <-Est_CellType[,1]
Est_CellType<-Est_CellType[,-1]
group <-c(rep("A",53),rep("OA",46))
pdata <-cbind(Est_CellType[,1:7],group)
colorCodes <- c("A"="blue","OA"="red")
hc.cols <- hclust(dist((Est_CellType[,1:7])), "ward.D2")
dend <- as.dendrogram(hc.cols)
labels_colors(dend) <- colorCodes[pdata$group][order.dendrogram(dend)]
options(repr.plot.width = 12, repr.plot.height = 4)
plot(dend)
In [81]:
group <-c(rep("A",53),rep("OA",46))
pdata <-cbind(Est_CellType[,1:7],group)
dat.m<-melt(pdata,measure.vars =c('cluster_0','cluster_1','cluster_2','cluster_3','cluster_4','cluster_5','cluster_6'),  id.vars = 'group')
p<-ggplot(dat.m)+
  geom_boxplot(aes(color=group,y=value,x=variable))
p
In [82]:
pdata$sample <- row.names(pdata)
dat.m<-melt(pdata[,c(1:7,9)],id.vars = 'sample')

p <- ggplot(dat.m, aes(sample, value, fill = variable)) +
    geom_bar(position = "fill", stat = "identity") +
    theme(axis.text=element_text(size=6))
p
In [84]:
classes1=as.factor(pdata$group)
test<-pdata[,1:7]
stats_val <-matrix(ncol=4,nrow=7)
for(i in 1:7){
  stats_val[i,1] <-median((as.double(test[which(classes1=="A"),i])))
  stats_val[i,2] <-median((as.double(test[which(classes1=="OA"),i])))
  stats_val[i,3] <-median((as.double(test[which(classes1=="A"),i])))-median((as.double(test[which(classes1=="OA"),i])))
  stats_val[i,4] <-wilcox.test(as.double(test[which(classes1=="A"),i]),as.double(test[which(classes1=="OA"),i]))$p.value
}
row.names(stats_val)<-colnames(pdata)[1:7]
colnames(stats_val)<-c("Median_asthma","Median_obese_asthma","Diff(median, A-OAg)","pvalue(wilcox)")
stats_val
Median_asthmaMedian_obese_asthmaDiff(median, A-OAg)pvalue(wilcox)
cluster_00.24876943 0.22376571 0.0250037150.02499174
cluster_10.13161950 0.14957356 -0.0179540520.06350140
cluster_20.34811205 0.39903254 -0.0509204950.56265720
cluster_30.00000000 0.00000000 0.0000000000.30735110
cluster_40.14302818 0.12854396 0.0144842190.15738616
cluster_50.00000000 0.00000000 0.0000000000.29227641
cluster_60.09304928 0.09602318 -0.0029738980.20539070

Feb 26, 2018

Finding markers between each cluster

This time, I used FindMarkers function instead of FindAllMarkers function.

In [3]:
load("~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/T_nmr2.Robj")
In [4]:
options(repr.plot.width = 6, repr.plot.height = 6)

p1 <- TSNEPlot(object = T_nmr,do.label = T,group.by = "protocol",pt.size = 0.2)
p2 <- TSNEPlot(object = T_nmr,do.label = T,pt.size = 0.2)
In [8]:
C1_3.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 1, ident.2 = 3)
C4.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 4)

write.table(C1_3.markers,file="~/scRNA_seq/C1_3.markers.csv",sep=",")
write.table(C4.markers,file="~/scRNA_seq/C4.markers.csv",sep=",")

Apr 30, 2018

Finding markers

Deepa asked me to gererate maker gene lists of following combinations.

  1. C0 to all other cell types
  2. C1 to all other cell types
  3. C2 to all other cell types
  4. C3 to all other cell types
In [4]:
C0.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 0,logfc.threshold = 0.2)
In [5]:
#C0.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 0)
C1.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 1,logfc.threshold = 0.2)
C2.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 2,logfc.threshold = 0.2)
C3.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 3,logfc.threshold = 0.2)

write.table(C0.markers,file="~/scRNA_seq/C0.markers.csv",sep=",")
write.table(C1.markers,file="~/scRNA_seq/C1.markers.csv",sep=",")
write.table(C2.markers,file="~/scRNA_seq/C2.markers.csv",sep=",")
write.table(C3.markers,file="~/scRNA_seq/C3.markers.csv",sep=",")

May 10th, 2018

Deepa and Andrew found issues on some expression data, and they decided to remove those samples from the analysis. I obtained a new file from Deepa to perform the cell subtype proportion analysis.

Expression data loading

In [6]:
Exp <-read.table("~/scRNA_seq/Deepa/normcounts_allexcept_AJqc2",sep="\t",header=F)
colnames(Exp)<- c("ENSEMBL","a1", "a2", "a3", "a4", "a5", "a6", "a7", "a9", "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", "a30", "a31", "a32", "a33", "a34", "a36", "a37", "a38", "a39", "a40", "a41", "a42", "a43", "a44", "a45", "a46", "a48", "a49", "a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", "a60", "a61", "oa1", "oa2", "oa5", "oa6", "oa7", "oa8", "oa9", "oa10", "oa11", "oa12", "oa13", "oa14", "oa15", "oa16", "oa18", "oa19", "oa20", "oa21", "oa22", "oa23", "oa24", "oa26", "oa27", "oa28", "oa29", "oa30", "oa31", "oa32", "oa33", "oa34", "oa35", "oa36", "oa37", "oa38", "oa39", "oa40", "oa41", "oa42", "oa43", "oa44", "oa45", "oa46", "oa47", "oa48", "oa49", "oa50", "oa51", "oa53")
Exp[1:3,1:4]
ENSEMBLa1a2a3
ENSG000002239720.000000 0.00000 0.4784112
ENSG000002272320.000000 0.00000 0.0000000
ENSG000002782671.398628 2.04691 0.0000000

I converted EnsID to gene symbol.

In [5]:
library(clusterProfiler)
library("org.Hs.eg.db")
Exp_gene <- bitr(Exp$EnsID, fromType = "ENSEMBL",
                toType = c("SYMBOL"),
                OrgDb = org.Hs.eg.db)
head(Exp_gene)
'select()' returned 1:many mapping between keys and columns
Warning message in bitr(Exp$EnsID, fromType = "ENSEMBL", toType = c("SYMBOL"), OrgDb = org.Hs.eg.db):
“57.91% of input gene IDs are fail to map...”
ENSEMBLSYMBOL
1ENSG00000223972DDX11L1
3ENSG00000278267MIR6859-1
6ENSG00000237613FAM138A
9ENSG00000186092OR4F5
10ENSG00000238009LOC100996442
21ENSG00000273874MIR6859-2
In [7]:
exp <-merge(Exp_gene,Exp,by="ENSEMBL")
head(exp)
ENSEMBLSYMBOLa1a2a3a4a5a6a7a9oa43oa44oa45oa46oa47oa48oa49oa50oa51oa53
ENSG00000000003TSPAN6 2.097943 9.552245 12.43869 6.879843 5.145786 6.859224 8.103091 9.579108 10.32720 7.837872 4.748403 12.82669 6.914725 20.49891 10.28306 4.575261 5.157741 9.177073
ENSG00000000005TNMD 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000
ENSG00000000419DPM1 232.172322 210.149382 232.02943 259.905198 199.828041 262.365332 160.441202 252.004238 133.64617 201.219541 141.027571 121.54820 117.550331 81.99565 109.68595 125.057144 119.917480 294.239910
ENSG00000000457SCYL3 139.862845 130.319909 132.99831 123.072756 91.766525 87.455111 191.232948 170.213389 140.93596 136.093953 177.590274 194.23281 165.953408 97.36983 147.39049 89.980140 108.312563 149.701007
ENSG00000000460C1orf112 34.965711 46.396617 44.01383 31.341509 31.732350 34.296122 35.653601 25.789907 35.23399 50.874912 37.987224 45.19883 27.658901 25.62364 27.42149 22.876307 27.078141 31.546189
ENSG00000000938FGR 2.797257 17.057580 18.65804 4.586562 52.315496 0.000000 28.360819 15.473944 26.72923 88.496696 28.490418 133.76410 13.829451 40.99782 41.13223 83.879792 68.340069 13.192043
In [9]:
SYMBOL<-exp$SYMBOL
aggdata_genename <-aggregate(exp[,3:dim(exp)[2]], by=list(SYMBOL), 
                             FUN=mean, na.rm=TRUE) ## calculate mean value per gene (by gene name)
paste("Number of transcript = ",dim(aggdata_genename)[1],sep="")
paste("Number of sample = ",dim(aggdata_genename)[2],sep="")
write.table(aggdata_genename,file="~/scRNA_seq/Deepa/exp_status_byGene.txt",row.names = F,col.names = T,sep="\t",quote = F)
'Number of transcript = 25553'
'Number of sample = 106'
In [41]:
Est_CellType <-read.csv(file="~/scRNA_seq/Deepa/CIBERSORT.Output_Job9.csv",header = T)
row.names(Est_CellType) <-Est_CellType[,1]
Est_CellType<-Est_CellType[,-1]
group <-c(rep("A",57),rep("OA",48))
pdata <-cbind(Est_CellType[,1:7],group)
colorCodes <- c("A"="blue","OA"="red")
hc.cols <- hclust(dist((Est_CellType[,1:7])), "ward.D2")
dend <- as.dendrogram(hc.cols)
labels_colors(dend) <- colorCodes[pdata$group][order.dendrogram(dend)]
options(repr.plot.width = 12, repr.plot.height = 4)
plot(dend)
In [49]:
pdata_merge <-cbind(pdata[,c(1,2)],pdata[,3]+pdata[,4],pdata[,5],pdata[,6],pdata[,8])
colnames(pdata_merge)<-c("cluster_0","cluster_1","cluster_2/3","cluster_4","cluster_5","group")
head(pdata_merge)
cluster_0cluster_1cluster_2/3cluster_4cluster_5group
a10.3672779 0.115547020.2441837 0.190065690 A
a20.3809117 0.141788440.2383514 0.159450640 A
a30.3705765 0.058820770.3151485 0.163367640 A
a40.4592506 0.027376320.1809006 0.223981240 A
a50.2150827 0.086357960.5188332 0.101687690 A
a60.3649462 0.147544830.3704865 0.043238910 A
In [50]:
dat.m<-melt(pdata_merge,measure.vars =c('cluster_0','cluster_1','cluster_2/3','cluster_4','cluster_5'),  id.vars = 'group')
p<-ggplot(dat.m)+
  geom_boxplot(aes(color=group,y=value,x=variable))

pdf(file="~/scRNA_seq/Deepa/CellTypeProportion.pdf",width = 8, height = 4)
p+theme_bw()
dev.off()
pdf: 2
In [53]:
classes1=as.factor(pdata$group)
test<-pdata[,1:5]
stats_val <-matrix(ncol=4,nrow=5)
for(i in 1:5){
  stats_val[i,1] <-median((as.double(test[which(classes1=="A"),i])))
  stats_val[i,2] <-median((as.double(test[which(classes1=="OA"),i])))
  stats_val[i,3] <-median((as.double(test[which(classes1=="A"),i])))-median((as.double(test[which(classes1=="OA"),i])))
  stats_val[i,4] <-wilcox.test(as.double(test[which(classes1=="A"),i]),as.double(test[which(classes1=="OA"),i]))$p.value
}
row.names(stats_val)<-colnames(pdata_merge)[1:5]
colnames(stats_val)<-c("Median_asthma","Median_obese_asthma","Diff(median, A-OAg)","pvalue(wilcox)")
stats_val
Median_asthmaMedian_obese_asthmaDiff(median, A-OAg)pvalue(wilcox)
cluster_00.2510032 0.2120875 0.038915670.02183733
cluster_10.1354094 0.1474065 -0.011997080.01095745
cluster_2/30.3743561 0.4466935 -0.072337340.23148078
cluster_40.0000000 0.0000000 0.000000000.55741461
cluster_50.1392529 0.1094765 0.029776430.06388769

Eliminate cluster 6 (likely to be double or triple)

In [17]:
Est_CellType <-read.csv(file="~/scRNA_seq/Deepa/CIBERSORT.Output_Job10.csv",header = T)
row.names(Est_CellType) <-Est_CellType[,1]
Est_CellType<-Est_CellType[,-1]
group <-c(rep("A",57),rep("OA",48))
pdata <-cbind(Est_CellType[,1:7],group)
colorCodes <- c("A"="blue","OA"="red")
hc.cols <- hclust(dist((Est_CellType[,1:6])), "ward.D2")
dend <- as.dendrogram(hc.cols)
labels_colors(dend) <- colorCodes[pdata$group][order.dendrogram(dend)]
options(repr.plot.width = 12, repr.plot.height = 4)
plot(dend)
In [18]:
dat.m<-melt(pdata,measure.vars =c('cluster_0','cluster_1','cluster_2','cluster_3','cluster_4','cluster_5'),  id.vars = 'group')
p<-ggplot(dat.m)+
  geom_boxplot(aes(color=group,y=value,x=variable))
p
In [21]:
classes1=as.factor(pdata$group)
test<-pdata[,1:6]
stats_val <-matrix(ncol=4,nrow=6)
for(i in 1:6){
  stats_val[i,1] <-median((as.double(test[which(classes1=="A"),i])))
  stats_val[i,2] <-median((as.double(test[which(classes1=="OA"),i])))
  stats_val[i,3] <-median((as.double(test[which(classes1=="A"),i])))-median((as.double(test[which(classes1=="OA"),i])))
  stats_val[i,4] <-wilcox.test(as.double(test[which(classes1=="A"),i]),as.double(test[which(classes1=="OA"),i]))$p.value
}
row.names(stats_val)<-colnames(pdata)[1:6]
colnames(stats_val)<-c("Median_asthma","Median_obese_asthma","Diff(median, A-OAg)","pvalue(wilcox)")
stats_val
Median_asthmaMedian_obese_asthmaDiff(median, A-OAg)pvalue(wilcox)
cluster_00.2856210 0.2387547 0.046866280.12341826
cluster_10.1086801 0.1232659 -0.014585790.07962679
cluster_20.4318744 0.4548230 -0.022948620.51585940
cluster_30.0000000 0.0000000 0.000000000.19095310
cluster_40.1720163 0.1581392 0.013877120.41395792
cluster_50.0000000 0.0000000 0.00000000 NaN

May 11th, 2018

Deepa requested the signature genes of cluster 5.

In [3]:
PrintFindClustersParams(object = T_nmr)
Parameters used in latest FindClusters calculation run on: 2018-01-05 14:59:17
=============================================================================
Resolution: 0.6
-----------------------------------------------------------------------------
Modularity Function    Algorithm         n.start         n.iter
     1                   1                 100             10
-----------------------------------------------------------------------------
Reduction used          k.param          prune.SNN
     pca                 30                0.0667
-----------------------------------------------------------------------------
Dims used in calculation
=============================================================================
1 2 3 4 5 6 7 8 9 10

In [5]:
C5.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 5)
write.table(C5.markers,file="~/scRNA_seq/C5.markers.csv",sep=",")

Comments from Deepa regarding cluter5.

11 of the top 20 genes in Cluster 5 match with NK cells. Of these several overlap with cytotoxic T lymphocytes. So either the source of blood for the 10xGenomics analysis was someone with acute infection or their purity was impacted by an older lot of the T cell separation kit, where they used to not include NK cell markers. Cytotoxic cells are CD8+ so I am assuming that those were removed if the separation was done correctly. Either which way these cells are not CD4+ T cells, so I feel comfortable removing them from the figure.

We decided to add CD8+/CD45RA+ Naive Cytotoxic T Cells into the analysis

The data is from https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/naive_cytotoxic

NOTE

I got an error message. Error: vector memory exhausted (limit reached?)

So, I analyzed the data upto PCA step.

In [8]:
memory_t.data <- Read10X("memory_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = memory_t.data), value = TRUE)
gene_name <-rownames(memory_t.data)
'%ni%' <- Negate('%in%')
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(memory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(memory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(memory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(memory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(memory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(memory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(memory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))
not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
sex_chr <-read.table("sex_chr_human.txt")
sex_chr2 <-sex_chr[,2]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
memory_t.data2<-memory_t.data[rownames(memory_t.data)%in%not_ribo_auto,]

memory_t <- CreateSeuratObject(raw.data = memory_t.data2, min.cells = 3, min.genes = 200, project = "memory_t")

naive_t.data <- Read10X("naive_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = naive_t.data), value = TRUE)
gene_name <-rownames(naive_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(naive_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(naive_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(naive_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(naive_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(naive_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(naive_t.data), value = T)
hla.genes <-grep("^HLA", rownames(naive_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))

not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
naive_t.data2<-naive_t.data[rownames(naive_t.data)%in%not_ribo_auto,]


naive_t <- CreateSeuratObject(raw.data = naive_t.data2, min.cells = 3, min.genes = 200, project = "naive_t")

regulatory_t.data <- Read10X("regulatory_t/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = regulatory_t.data), value = TRUE)
gene_name <-rownames(regulatory_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(regulatory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(regulatory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(regulatory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(regulatory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(regulatory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(regulatory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(regulatory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))

not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
regulatory_t.data2<-regulatory_t.data[rownames(regulatory_t.data)%in%not_ribo_auto,]

regulatory_t <- CreateSeuratObject(raw.data = regulatory_t.data2, min.cells = 3, min.genes = 200, project = "regulatory_t")


naiveCyto_t.data <- Read10X("Naive_cytotoxic/filtered_matrices_mex/hg19/")
mito.genes <- grep("^MT-", x = rownames(x = naiveCyto_t.data), value = TRUE)
gene_name <-rownames(naiveCyto_t.data)
not_mito <-gene_name[gene_name%ni%mito.genes=="TRUE"]
ribo1.genes <- grep("^MRPL", rownames(regulatory_t.data), value = T)
ribo2.genes <- grep("^RPL", rownames(regulatory_t.data), value = T)
RP1.genes <- grep("^RP1", rownames(regulatory_t.data), value = T)
RP4.genes <- grep("^RP4", rownames(regulatory_t.data), value = T)
RP3.genes <- grep("^RP3", rownames(regulatory_t.data), value = T)
RPS.genes <- grep("^RPs", rownames(regulatory_t.data), value = T)
hla.genes <-grep("^HLA", rownames(regulatory_t.data), value = T)
ribo.genes <-unique(c(ribo1.genes,ribo2.genes,RP1.genes,RP3.genes,RPS.genes,hla.genes))

not_ribo <-not_mito[not_mito%ni%ribo.genes=="TRUE"]
not_ribo_auto <-not_ribo[not_ribo%ni%sex_chr2=="TRUE"]
naiveCyto_t.data2<-naiveCyto_t.data[rownames(naiveCyto_t.data)%in%not_ribo_auto,]
naiveCyto_t<- CreateSeuratObject(raw.data = regulatory_t.data2, min.cells = 3, min.genes = 200, project = "naiveCyto_t")

memory_t@meta.data[, "protocol"] <- "memory_t"
naive_t@meta.data[, "protocol"] <- "naive_t"
regulatory_t@meta.data[, "protocol"] <- "regulatory_t"
naiveCyto_t@meta.data[, "protocol"] <- "naiveCyto_t"
In [10]:
T_nm <-MergeSeurat(naive_t,memory_t,add.cell.id1 = "T_n",add.cell.id2 ="T_m")
T_nmr <-MergeSeurat(T_nm,regulatory_t,add.cell.id2 ="T_r")
T_nmrc <-MergeSeurat(T_nmr,naiveCyto_t,add.cell.id2 ="T_c")

T_nmrc <- FilterCells(object = T_nmrc, subset.names = c("nGene"), 
                      low.thresholds = c(200), high.thresholds = c(3000))
T_nmrc <- NormalizeData(object = T_nmrc)
T_nmrc <- ScaleData(object = T_nmrc,vars.to.regress=c("nGene","nUMI"))
pdf("FindVariableGenes.pdf")
T_nmrc <- FindVariableGenes(object = T_nmrc, mean.function = ExpMean, dispersion.function = LogVMR, 
                            x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
dev.off()
saveRDS(T_nmrc,file = "T_nmrc.RDS")

cat("Identified variable genes across the single cells = ", length(x = T_nmrc@var.genes))
T_nmrc <- RunPCA(object = T_nmrc, pc.genes = T_nmrc@var.genes, do.print = FALSE, 
                 pcs.compute = 30)
saveRDS(T_nmrc,file = "T_nmrc2.RDS")
Regressing out: nGene, nUMI
Time Elapsed:  7.60751009782155 mins
Error: vector memory exhausted (limit reached?)
Traceback:

1. ScaleData(object = T_nmrc, vars.to.regress = c("nGene", "nUMI"))
2. RegressOutResid(object = object, vars.to.regress = vars.to.regress, 
 .     genes.regress = genes.use, use.umi = use.umi, model.use = model.use, 
 .     display.progress = display.progress, do.par = do.par, num.cores = num.cores)
3. unlist(x = data.resid[seq.int(from = 2, to = length(x = data.resid), 
 .     by = 2)])
In [5]:
load("/Volumes/home/msuzuki/scRNA/T_nmrc2.Robj")
In [3]:
options(repr.plot.width = 8, repr.plot.height = 4)
PCElbowPlot(T_nmrc, num.pc = 20)
PCAPlot(object = T_nmrc, dim.1 = 1, dim.2 = 2)
In [4]:
options(repr.plot.width = 8, repr.plot.height = 8)
PCHeatmap(object = T_nmrc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE, 
          label.columns = FALSE, use.full = FALSE)
In [5]:
T_nmrc <- RunTSNE(T_nmrc, dims.use = 1:10, do.fast = T, check_duplicates = FALSE)
T_nmrc <- FindClusters(object = T_nmrc,reduction.type = "pca", dims.use = 1:10, resolution = 0.6, save.SNN = T,force.recalc=T)

PrintFindClustersParams(object = T_nmrc)
Parameters used in latest FindClusters calculation run on: 2018-05-16 12:04:44
=============================================================================
Resolution: 0.6
-----------------------------------------------------------------------------
Modularity Function    Algorithm         n.start         n.iter
     1                   1                 100             10
-----------------------------------------------------------------------------
Reduction used          k.param          prune.SNN
     pca                 30                0.0667
-----------------------------------------------------------------------------
Dims used in calculation
=============================================================================
1 2 3 4 5 6 7 8 9 10

Aug 20th, 2018

Deepa asked the gene signature file of cluter 4 and 6.

I loaded the robj "T_nmr2.Robj" to the R.

In [6]:
load("~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/T_nmr2.Robj")
library("Seurat")
PrintFindClustersParams(object = T_nmr)
p1 <- TSNEPlot(object = T_nmr,do.label = T,group.by = "protocol",pt.size = 0.2)
p2 <- TSNEPlot(object = T_nmr,do.label = T,pt.size = 0.2)
Parameters used in latest FindClusters calculation run on: 2018-01-05 14:59:17
=============================================================================
Resolution: 0.6
-----------------------------------------------------------------------------
Modularity Function    Algorithm         n.start         n.iter
     1                   1                 100             10
-----------------------------------------------------------------------------
Reduction used          k.param          prune.SNN
     pca                 30                0.0667
-----------------------------------------------------------------------------
Dims used in calculation
=============================================================================
1 2 3 4 5 6 7 8 9 10

In [7]:
C4.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 4)
C6.markers <- FindMarkers(T_nmr, only.pos = TRUE, ident.1 = 6)

write.table(C4.markers,file="~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/Deepa/C4.markers.csv",sep=",")
write.table(C6.markers,file="~/Dropbox (EinsteinMed)/PBMC_paper/scRNA_seq/Deepa/C6.markers.csv",sep=",")

Mar 6th, 2019

Permutaion analysis to test if the candidate genes are enriched to lung function is statistically significant or not.

Deepa identified 157 candidate genes and of those 20 genes are significantly assoicaated with lung function. Therefore, I performed the analysis to see how many randomly selected 157 genes have significant association to lung function.

Loading data

In [1]:
data <-read.csv("~/Dropbox (EinsteinMed)/Deepa//normcounts_excpt_AJqc_final_cha36b30rma22a26.csv",sep=",",header=T)
In [6]:
head(data)
cat("This dataset contains 2 lung function values and ",dim(data)[1]-2, "gene expression profile from ", dim(data)[2]-1," individuals")
Xa1a2a3a4a5a6a7a9a10oa43oa44oa45oa46oa47oa48oa49oa50oa51oa53
erv 88.000000 92.000000 110.000000 156.000000 73.000000 102.000000 94.000000 43.000000 76.0000000 50.000000 82.000000 80.00000 72.000000 59.000000 60.000000 48 37.000000 80.0000 100.000000
fev/fvc 94.000000 94.000000 93.000000 86.000000 91.000000 84.000000 90.000000 64.000000 75.0000000 64.000000 76.000000 67.00000 85.000000 96.000000 87.000000 71 70.000000 76.0000 87.000000
ENSG00000223972 0.000000 0.000000 0.476121 0.000000 0.000000 0.000000 0.000000 0.000000 0.0000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0 0.000000 0.0000 0.000000
ENSG00000227232 0.000000 0.000000 0.000000 1.520741 1.705876 3.413338 6.452136 3.666738 0.5226871 15.103064 7.373899 7.55320 8.499033 6.883557 5.098636 0 0.000000 11.5525 0.570652
ENSG00000278267 1.391147 2.037039 0.000000 1.520741 1.705876 0.000000 0.000000 1.466695 0.5226871 6.645348 1.418058 4.72075 1.821221 0.000000 0.000000 0 1.517587 0.0000 0.570652
ENSG00000243485 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0 0.000000 0.0000 0.000000
This dataset contains 2 lung function values and  60467 gene expression profile from  103  individuals
In [21]:
cat("Removing zero expressed genes for all samples")

clean <-data[which(rowSums(data[,2:dim(data)[2]])!=0),]
row.names(clean)<-clean[,1]
clean <- clean[,-1]

cat("After removing those genes, we have ",dim(clean)[1]-2, "gene expression profile from ", dim(clean)[2]," individuals")
Removing zero expressed genes for all samplesAfter removing those genes, we have  53901 gene expression profile from  103  individuals
In [22]:
cat("Calculate the associations between lung function and gene expression")

cor_result <-matrix(nrow=dim(clean)[1]-2,ncol=2)
row.names(cor_result)<-row.names(clean)[3:dim(clean)[1]]

for(i in 3:dim(clean)[1])
{
  erv <-as.numeric(clean[1,])
  fev <-as.numeric(clean[2,])
  test <-as.numeric(clean[i,])
  cor_result[i-2,1] <-unlist(cor.test(erv,test)[3])
  cor_result[i-2,2] <-unlist(cor.test(fev,test)[3])
}
Calculate the associations between lung function and gene expression
In [23]:
summary(cor_result)
       V1                  V2           
 Min.   :0.0000004   Min.   :0.0000042  
 1st Qu.:0.1717232   1st Qu.:0.0840595  
 Median :0.3585358   Median :0.2376174  
 Mean   :0.4075261   Mean   :0.3402060  
 3rd Qu.:0.6244445   3rd Qu.:0.5628221  
 Max.   :0.9999921   Max.   :0.9999896  
In [24]:
head(cor_result)
ENSG000002239727.975091e-010.85999697
ENSG000002272326.624005e-010.73707955
ENSG000002782678.671255e-010.08155984
ENSG000002434851.353068e-050.74091180
ENSG000002403612.351826e-010.56742587
ENSG000001860927.511719e-020.43249990
In [31]:
per_result <-matrix(nrow=1000,ncol=3)
for(i in 1:1000){
  rnd_num <-sample(dim(cor_result)[1],157)
  per_result[i,1] <-length(which(cor_result[rnd_num,1] < 0.05))
  per_result[i,2] <-length(which(cor_result[rnd_num,2] < 0.05))
  per_result[i,3] <-length(which((cor_result[rnd_num,2] < 0.05)&(cor_result[rnd_num,1] < 0.05)))
}
In [35]:
options(repr.plot.width = 4, repr.plot.height = 4)
hist(per_result[,1],main="ERV",breaks =25,xlim =c(0,60),xlab = "")
abline(v=39,col=2)

cat("p-value= ",length(which(per_result[,1]>(39)))+1/(1000+1))
p-value=  0.000999001
In [36]:
hist(per_result[,2],main="FEV/FVC",breaks =25,xlim =c(0,60),xlab = "")
abline(v=55,col=2)

cat("p-value= ",length(which(per_result[,2]>(24+31)))+1/(1000+1))
p-value=  0.000999001
In [37]:
hist(per_result[,3],main="ERV&FEVV/FVC",breaks =15,xlim =c(0,60),xlab = "")
abline(v=24,col=2)

cat("p-value= ",length(which(per_result[,3]>(24)))+1/(1000+1))
p-value=  0.000999001